In [1]:

    
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sb

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory



In [2]:

    
import os

print(os.listdir("C:\\Users\\ajaohri\\Desktop\\all"))

# Any results you write to the current directory are saved as output.









    



['gender_submission.csv', 'test.csv', 'train.csv']



In [3]:

    
url_train = 'C:\\Users\\ajaohri\\Desktop\\all/train.csv'
titanic = pd.read_csv(url_train)
titanic.head()









    Out[3]:







  
    
      
      PassengerId
      Survived
      Pclass
      Name
      Sex
      Age
      SibSp
      Parch
      Ticket
      Fare
      Cabin
      Embarked
    
  
  
    
      0
      1
      0
      3
      Braund, Mr. Owen Harris
      male
      22.0
      1
      0
      A/5 21171
      7.2500
      NaN
      S
    
    
      1
      2
      1
      1
      Cumings, Mrs. John Bradley (Florence Briggs Th...
      female
      38.0
      1
      0
      PC 17599
      71.2833
      C85
      C
    
    
      2
      3
      1
      3
      Heikkinen, Miss. Laina
      female
      26.0
      0
      0
      STON/O2. 3101282
      7.9250
      NaN
      S
    
    
      3
      4
      1
      1
      Futrelle, Mrs. Jacques Heath (Lily May Peel)
      female
      35.0
      1
      0
      113803
      53.1000
      C123
      S
    
    
      4
      5
      0
      3
      Allen, Mr. William Henry
      male
      35.0
      0
      0
      373450
      8.0500
      NaN
      S



In [4]:

    
#Checking if our target variable is binary or not
sb.countplot(x='Survived',data=titanic)









    Out[4]:





<matplotlib.axes._subplots.AxesSubplot at 0x2179d4faa20>



In [5]:

    
#Checking Null values
titanic.isnull().sum()









    Out[5]:





PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

Dropping PassengerId, Name and Ticket because they are unique. Dropping Cabin because of too many null values.



In [6]:

    
titanic_data = titanic.drop(['PassengerId','Name','Ticket'],1)
titanic_data.head()

Now need to take care of the missing data for Age variable. Need to approximate- one way, to take mean age for all the missing values. Or, find if age is related to Pclass, and assign respective means.



In [7]:

    
sb.boxplot(x='Pclass',y='Age',data=titanic_data)









    Out[7]:





<matplotlib.axes._subplots.AxesSubplot at 0x2179f5a91d0>

If Passenger belongs to Pclass 3, age assigned is 24, if 2, age is assigned 29, if 1 then 37.



In [8]:

    
def age_approx(cols):
    age = cols[0]
    pclass = cols[1]
    if pd.isnull(age):
        if pclass == 1:
            return 37
        elif pclass == 2:
            return 29
        else:
            return 24
    else:
        return age



In [9]:

    
titanic_data['Age'] = titanic_data[['Age', 'Pclass']].apply(age_approx, axis=1)
titanic_data.isnull().sum()









    Out[9]:





Survived      0
Pclass        0
Sex           0
Age           0
SibSp         0
Parch         0
Fare          0
Cabin       687
Embarked      2
dtype: int64



In [10]:

    
def cabin_approx(cols):
    cabin = cols[0]
    pclass = cols[1]
    if pd.isnull(cabin):
        return 0
    elif cabin[0] == ('C' or 'B'):
        return 3
    elif cabin[0] == ('A' or 'D' or 'E' or 'T'):
        return 2
    elif cabin[0] == ('F' or 'G'):
        return 1
    else:
        return 0



In [11]:

    
titanic_data['Cabin'] = titanic_data[['Cabin', 'Pclass']].apply(cabin_approx, axis=1)
#titanic_data.isnull().sum()
sb.boxplot(x='Cabin',y='Fare',data=titanic_data)









    Out[11]:





<matplotlib.axes._subplots.AxesSubplot at 0x2179f673f60>

There are two null values in Embarked, we can just drop them.



In [12]:

    
titanic_data.dropna(inplace=True)
titanic_data.isnull().sum()









    Out[12]:





Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Cabin       0
Embarked    0
dtype: int64

Getting dummy variables from categorical ones.



In [13]:

    
gender = pd.get_dummies(titanic_data['Sex'],drop_first=True)
gender.head()



In [14]:

    
embark_location = pd.get_dummies(titanic_data['Embarked'],drop_first=True)
embark_location.head()



In [15]:

    
titanic_data.drop(['Sex','Embarked'],axis=1,inplace=True)
titanic_data.head()



In [16]:

    
titanic_dmy = pd.concat([titanic_data, gender, embark_location],axis=1)
titanic_dmy.tail()



In [17]:

    
#Checking for correlation between variables.
sb.heatmap(titanic_dmy.corr(),square=True)
#print(titanic_dmy.corr())









    Out[17]:





<matplotlib.axes._subplots.AxesSubplot at 0x2179fa0be10>



In [18]:

    
X = titanic_dmy.ix[:,(1,2,3,4,5,6,7,8,9)].values
y = titanic_dmy.ix[:,0].values
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.1, random_state=2)









    



C:\Users\ajaohri\AppData\Local\Continuum\anaconda3\lib\site-packages\ipykernel_launcher.py:1: DeprecationWarning: 
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.
C:\Users\ajaohri\AppData\Local\Continuum\anaconda3\lib\site-packages\ipykernel_launcher.py:2: DeprecationWarning: 
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  
C:\Users\ajaohri\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)

The train test split is done for parameter tuning. We now deploy the models.



In [20]:

    
!pip install xgboost









    



Collecting xgboost
  Downloading https://files.pythonhosted.org/packages/03/b8/0fcc6d3f28f45c5d1ef33fecca4b3bbcf8c4f53bebdc9b9146dad8fda64a/xgboost-0.80-py2.py3-none-win_amd64.whl (7.1MB)
Requirement already satisfied: scipy in c:\users\ajaohri\appdata\local\continuum\anaconda3\lib\site-packages (from xgboost) (1.1.0)
Requirement already satisfied: numpy in c:\users\ajaohri\appdata\local\continuum\anaconda3\lib\site-packages (from xgboost) (1.15.0)
Installing collected packages: xgboost
Successfully installed xgboost-0.80



In [24]:

    
from sklearn.ensemble import RandomForestClassifier
#from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import VotingClassifier

clf1 = SVC(kernel='linear',C=1.0,random_state=3)
clf2 = XGBClassifier(random_state=3)
clf3 = RandomForestClassifier(n_estimators=30, max_depth=10, random_state=300)
eclf = VotingClassifier(estimators=[('clf1', clf1), ('clf2', clf2),('clf3',clf3)], voting='hard')

eclf.fit(X_train, y_train)
y_pred = eclf.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(eclf.score(X_test, y_test))









    



[[47  5]
 [11 26]]
0.8202247191011236






    



C:\Users\ajaohri\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\preprocessing\label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.
  if diff:
C:\Users\ajaohri\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\preprocessing\label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.
  if diff:
C:\Users\ajaohri\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\preprocessing\label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.
  if diff:
C:\Users\ajaohri\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\preprocessing\label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.
  if diff:

Now taking in Competition Data.



In [25]:

    
url = 'C:\\Users\\ajaohri\\Desktop\\all/test.csv'
test = pd.read_csv(url)
test.head()









    Out[25]:







  
    
      
      PassengerId
      Pclass
      Name
      Sex
      Age
      SibSp
      Parch
      Ticket
      Fare
      Cabin
      Embarked
    
  
  
    
      0
      892
      3
      Kelly, Mr. James
      male
      34.5
      0
      0
      330911
      7.8292
      NaN
      Q
    
    
      1
      893
      3
      Wilkes, Mrs. James (Ellen Needs)
      female
      47.0
      1
      0
      363272
      7.0000
      NaN
      S
    
    
      2
      894
      2
      Myles, Mr. Thomas Francis
      male
      62.0
      0
      0
      240276
      9.6875
      NaN
      Q
    
    
      3
      895
      3
      Wirz, Mr. Albert
      male
      27.0
      0
      0
      315154
      8.6625
      NaN
      S
    
    
      4
      896
      3
      Hirvonen, Mrs. Alexander (Helga E Lindqvist)
      female
      22.0
      1
      1
      3101298
      12.2875
      NaN
      S



In [26]:

    
test.isnull().sum()









    Out[26]:





PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

There are 86 null values in Age, so we approximate them like we did earlier. There are 327 null values in Cabin, so we drop it altogether. There is 1 null value in Fare, so we approximate it according to the median of each class of the null position.



In [27]:

    
test.describe()









    Out[27]:







  
    
      
      PassengerId
      Pclass
      Age
      SibSp
      Parch
      Fare
    
  
  
    
      count
      418.000000
      418.000000
      332.000000
      418.000000
      418.000000
      417.000000
    
    
      mean
      1100.500000
      2.265550
      30.272590
      0.447368
      0.392344
      35.627188
    
    
      std
      120.810458
      0.841838
      14.181209
      0.896760
      0.981429
      55.907576
    
    
      min
      892.000000
      1.000000
      0.170000
      0.000000
      0.000000
      0.000000
    
    
      25%
      996.250000
      1.000000
      21.000000
      0.000000
      0.000000
      7.895800
    
    
      50%
      1100.500000
      3.000000
      27.000000
      0.000000
      0.000000
      14.454200
    
    
      75%
      1204.750000
      3.000000
      39.000000
      1.000000
      0.000000
      31.500000
    
    
      max
      1309.000000
      3.000000
      76.000000
      8.000000
      9.000000
      512.329200



In [28]:

    
sb.set(rc={'figure.figsize':(11.7,8.27)})
ax = sb.boxplot(x='Pclass',y='Fare',data=test,width=0.9)



In [29]:

    
def fare_approx(cols):
    fare = cols[0]
    pclass = cols[1]
    if pd.isnull(fare):
        if pclass == 1:
            return 55
        elif pclass == 2:
            return 20
        else:
            return 10
    else:
        return fare

Cleaning up the test data: Dropping variables, approximating age and fare, dummy variables.



In [30]:

    
test_data = test.drop(['Name','Ticket'],1)
test_data['Age'] = test_data[['Age', 'Pclass']].apply(age_approx, axis=1)
test_data['Fare'] = test_data[['Fare','Pclass']].apply(fare_approx, axis=1)
test_data['Cabin'] = test_data[['Cabin','Pclass']].apply(cabin_approx, axis=1)
#
gender_test = pd.get_dummies(test_data['Sex'],drop_first=True)
embark_location_test = pd.get_dummies(test_data['Embarked'],drop_first=True)
test_data.drop(['Sex','Embarked'],axis=1,inplace=True)
test_dmy = pd.concat([test_data, gender_test, embark_location_test],axis=1)

#test_dmy.describe()
test_data.dropna(inplace=True)
test_dmy.isnull().sum()









    Out[30]:





PassengerId    0
Pclass         0
Age            0
SibSp          0
Parch          0
Fare           0
Cabin          0
male           0
Q              0
S              0
dtype: int64



In [31]:

    
test_dmy.head()









    Out[31]:







  
    
      
      PassengerId
      Pclass
      Age
      SibSp
      Parch
      Fare
      Cabin
      male
      Q
      S
    
  
  
    
      0
      892
      3
      34.5
      0
      0
      7.8292
      0
      1
      1
      0
    
    
      1
      893
      3
      47.0
      1
      0
      7.0000
      0
      0
      0
      1
    
    
      2
      894
      2
      62.0
      0
      0
      9.6875
      0
      1
      1
      0
    
    
      3
      895
      3
      27.0
      0
      0
      8.6625
      0
      1
      0
      1
    
    
      4
      896
      3
      22.0
      1
      1
      12.2875
      0
      0
      0
      1



In [32]:

    
X_competition = test_dmy.ix[:,(1,2,3,4,5,6,7,8,9)].values









    



C:\Users\ajaohri\AppData\Local\Continuum\anaconda3\lib\site-packages\ipykernel_launcher.py:1: DeprecationWarning: 
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.

Prediction for Competition Data



In [33]:

    
y_comp = eclf.predict(X_competition)









    



C:\Users\ajaohri\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\preprocessing\label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.
  if diff:
C:\Users\ajaohri\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\preprocessing\label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.
  if diff:



In [34]:

    
submission = pd.DataFrame({'PassengerId':test_data['PassengerId'],'Survived':y_comp})
submission.head()









    Out[34]:







  
    
      
      PassengerId
      Survived
    
  
  
    
      0
      892
      0
    
    
      1
      893
      0
    
    
      2
      894
      0
    
    
      3
      895
      0
    
    
      4
      896
      1



In [35]:

    
filename = 'Titanic Predictions 1.csv'

submission.to_csv(filename,index=False)

print('Saved file: ' + filename)









    



Saved file: Titanic Predictions 1.csv



In [38]:

    
os.getcwd()









    Out[38]:





'C:\\Users\\ajaohri'



In [ ]:

	Survived	Pclass	Age	SibSp	Parch	Fare	Cabin	male	Q	S
886	0	2	27.0	0	0	13.00	0	1	0	1
887	1	1	19.0	0	0	30.00	0	0	0	1
888	0	3	24.0	1	2	23.45	0	0	0	1
889	1	1	26.0	0	0	30.00	3	1	0	0
890	0	3	32.0	0	0	7.75	0	1	1	0

	PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Ticket	Fare	Cabin	Embarked
0	1	0	3	Braund, Mr. Owen Harris	male	22.0	1	A/5 21171	7.2500	NaN	S
1	2	1	1	Cumings, Mrs. John Bradley (Florence Briggs Th...	female	38.0	1	PC 17599	71.2833	C85	C
2	3	1	3	Heikkinen, Miss. Laina	female	26.0	0	STON/O2. 3101282	7.9250	NaN	S
3	4	1	1	Futrelle, Mrs. Jacques Heath (Lily May Peel)	female	35.0	1	113803	53.1000	C123	S
4	5	0	3	Allen, Mr. William Henry	male	35.0	0	373450	8.0500	NaN	S

	PassengerId	Pclass	Name	Sex	Age	SibSp	Parch	Ticket	Fare	Cabin	Embarked
0	892	3	Kelly, Mr. James	male	34.5	0	0	330911	7.8292	NaN	Q
1	893	3	Wilkes, Mrs. James (Ellen Needs)	female	47.0	1	0	363272	7.0000	NaN	S
2	894	2	Myles, Mr. Thomas Francis	male	62.0	0	0	240276	9.6875	NaN	Q
3	895	3	Wirz, Mr. Albert	male	27.0	0	0	315154	8.6625	NaN	S
4	896	3	Hirvonen, Mrs. Alexander (Helga E Lindqvist)	female	22.0	1	1	3101298	12.2875	NaN	S

	PassengerId	Pclass	Age	SibSp	Parch	Fare
count	418.000000	418.000000	332.000000	418.000000	418.000000	417.000000
mean	1100.500000	2.265550	30.272590	0.447368	0.392344	35.627188
std	120.810458	0.841838	14.181209	0.896760	0.981429	55.907576
min	892.000000	1.000000	0.170000	0.000000	0.000000	0.000000
25%	996.250000	1.000000	21.000000	0.000000	0.000000	7.895800
50%	1100.500000	3.000000	27.000000	0.000000	0.000000	14.454200
75%	1204.750000	3.000000	39.000000	1.000000	0.000000	31.500000
max	1309.000000	3.000000	76.000000	8.000000	9.000000	512.329200